AddFusion

对两个输入张量逐元素求和,支持广播,并可选地对求和结果施加激活函数。

\[\mathrm{output}_i = \mathrm{activation}(\mathrm{input0}_i + \mathrm{input1}_i)\]
其中 activation 由 param[7](activation_type)指定,可选取值:
  • 0: 无激活,output = input0 + input1

  • 1: ReLU激活,output = max(0, input0 + input1)

  • 2: ReLU6激活,output = min(max(0, input0 + input1), 6)

输入:
  • input0 - 第一个输入数据地址。

  • input1 - 第二个输入数据地址。

  • output - 输出数据地址。

  • param - 参数数组(8个 unsigned long long 元素,地址类参数需强制转换为 unsigned long long 后存入),包含以下内容:
    • param[0]: input0_dims 地址(根据维度数量,分配相应空间)

    • param[1]: input1_dims 地址(根据维度数量,分配相应空间)

    • param[2]: output_dims 地址(根据维度数量,分配相应空间)

    • param[3]: strides0 地址(需分配 8*sizeof(int) 空间)

    • param[4]: strides1 地址(需分配 8*sizeof(int) 空间)

    • param[5]: strides_output 地址(需分配 8*sizeof(int) 空间)

    • param[6]: num_dims(维度数量,最大支持8维)

    • param[7]: activation_type(激活类型:0=无激活, 1=ReLU, 2=ReLU6)

  • core_mask - 核掩码(仅共享存储版本)。

输出:
  • output - 计算结果地址。

支持平台:

FT78NE MT7004

备注:

  • FT78NE 支持int8, int16, int32, fp32, fp64, cplx64, cplx128

  • MT7004 支持fp16, fp32, int16, int32, cplx64

  • cplx64 和 cplx128 不支持激活函数(activation_type 参数无效,始终执行普通加法)

  • 最大支持 8 维张量

功能说明:
  • 支持同形状加法(两个输入形状相同)

  • 支持标量广播加法(一个输入为标量)

  • 支持复杂广播加法(任意形状广播,最大8维)

共享存储版本:

void i8_add_s(int8_t *input0, int8_t *input1, int8_t *output, unsigned long long *param, int core_mask)
void i16_add_s(int16_t *input0, int16_t *input1, int16_t *output, unsigned long long *param, int core_mask)
void i32_add_s(int *input0, int *input1, int *output, unsigned long long *param, int core_mask)
void hp_add_s(half *input0, half *input1, half *output, unsigned long long *param, int core_mask)
void fp_add_s(float *input0, float *input1, float *output, unsigned long long *param, int core_mask)
void dp_add_s(double *input0, double *input1, double *output, unsigned long long *param, int core_mask)
void c64_add_s(float *input0, float *input1, float *output, unsigned long long *param, int core_mask)
void c128_add_s(double *input0, double *input1, double *output, unsigned long long *param, int core_mask)

C调用示例:

 1// FT78NE example: shared-storage fp32 add of two 32x32 tensors, no activation
 2#include <stdio.h>
 3#include <add.h>
 4
 5#define MAX_DIMS 8
 6
 7int main(int argc, char* argv[]) {
 8    // Input/output buffers live in DDR space, 0x1000 bytes apart.
 9    float *input0 = (float *)0xA0000000;
10    float *input1 = (float *)0xA0001000;
11    float *output = (float *)0xA0002000;
12
13    // 32*32 floats = 4096 bytes (0x1000) per tensor, so the three buffers
14    // above do not overlap. A larger shape needs a larger address spacing.
15    int input0_dims[MAX_DIMS] = {32, 32};
16    int input1_dims[MAX_DIMS] = {32, 32};
17    int output_dims[MAX_DIMS] = {32, 32};
18    // Stride buffers of 8 ints each; this example does not initialize them
19    // (presumably they are filled in by the library -- TODO confirm).
20    int strides0[MAX_DIMS], strides1[MAX_DIMS], strides_out[MAX_DIMS];
21    int num_dims = 2;
22    int activation_type = 0;  // 0 = no activation
23
24    // Pack the eight entries in the documented param[0..7] order; addresses
25    // are stored as unsigned long long values.
26    unsigned long long param[8];
27    param[0] = (unsigned long long)input0_dims;
28    param[1] = (unsigned long long)input1_dims;
29    param[2] = (unsigned long long)output_dims;
30    param[3] = (unsigned long long)strides0;
31    param[4] = (unsigned long long)strides1;
32    param[5] = (unsigned long long)strides_out;
33    param[6] = (unsigned long long)num_dims;
34    param[7] = (unsigned long long)activation_type;
35
36    // The shared-storage variant takes an additional core mask argument.
37    int core_mask = 0xff;
38    fp_add_s(input0, input1, output, param, core_mask);
39    return 0;
40}

私有存储版本:

void i8_add_p(int8_t *input0, int8_t *input1, int8_t *output, unsigned long long *param)
void i16_add_p(int16_t *input0, int16_t *input1, int16_t *output, unsigned long long *param)
void i32_add_p(int *input0, int *input1, int *output, unsigned long long *param)
void hp_add_p(half *input0, half *input1, half *output, unsigned long long *param)
void fp_add_p(float *input0, float *input1, float *output, unsigned long long *param)
void dp_add_p(double *input0, double *input1, double *output, unsigned long long *param)
void c64_add_p(float *input0, float *input1, float *output, unsigned long long *param)
void c128_add_p(double *input0, double *input1, double *output, unsigned long long *param)

C调用示例:

 1// FT78NE example: private-storage fp32 add of two 32x32 tensors with ReLU
 2#include <stdio.h>
 3#include <add.h>
 4
 5#define MAX_DIMS 8
 6
 7int main(int argc, char* argv[]) {
 8    // Input/output buffers live in L2 space, 0x1000 bytes apart.
 9    float *input0 = (float *)0x10800000;
10    float *input1 = (float *)0x10801000;
11    float *output = (float *)0x10802000;
12
13    // 32*32 floats = 4096 bytes (0x1000) per tensor, so the three buffers
14    // above do not overlap. A larger shape needs a larger address spacing.
15    int input0_dims[MAX_DIMS] = {32, 32};
16    int input1_dims[MAX_DIMS] = {32, 32};
17    int output_dims[MAX_DIMS] = {32, 32};
18    // Stride buffers of 8 ints each; this example does not initialize them
19    // (presumably they are filled in by the library -- TODO confirm).
20    int strides0[MAX_DIMS], strides1[MAX_DIMS], strides_out[MAX_DIMS];
21    int num_dims = 2;
22    int activation_type = 1;  // 1 = ReLU
23
24    // Pack the eight entries in the documented param[0..7] order; addresses
25    // are stored as unsigned long long values.
26    unsigned long long param[8];
27    param[0] = (unsigned long long)input0_dims;
28    param[1] = (unsigned long long)input1_dims;
29    param[2] = (unsigned long long)output_dims;
30    param[3] = (unsigned long long)strides0;
31    param[4] = (unsigned long long)strides1;
32    param[5] = (unsigned long long)strides_out;
33    param[6] = (unsigned long long)num_dims;
34    param[7] = (unsigned long long)activation_type;
35
36    // The private-storage variant takes no core mask.
37    fp_add_p(input0, input1, output, param);
38    return 0;
39}